Filter criteria:

# # FILTER DE GENES
# load('./../working_data/RNAseq_ASD_4region_normalized_vst.Rdata')
# 
# # Balance Groups by covariates, remove singular batches (none)
# to_keep = (datMeta$Subject_ID != 'AN03345') & !is.na(datMeta$Dx)
# table(to_keep)
# datMeta = datMeta[to_keep,]
# datExpr = datExpr[,to_keep]
# 
# # Select genes differentially expressed in ASD
# mod = model.matrix(~ Dx, data=datMeta)
# corfit = duplicateCorrelation(datExpr, mod, block=datMeta$Subject_ID)
# lmfit = lmFit(datExpr, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
# 
# fit = eBayes(lmfit, trend=T, robust=T)
# top_genes = topTable(fit, coef=2, number=nrow(datExpr))
# ordered_top_genes = top_genes[match(rownames(datExpr), rownames(top_genes)),]
# 
# ASD_pvals = rownames(ordered_top_genes)[ordered_top_genes$adj.P.Val<0.05]     #  6% of genes (3380)
# ASD_lfc = rownames(ordered_top_genes)[abs(ordered_top_genes$logFC)>log2(1.2)] # 13% of genes (6732)
# ASD_pvals_lfc = intersect(ASD_pvals, ASD_lfc)                                 #  6% of genes (2928)
# 
# datExpr = datExpr[match(ASD_pvals_lfc, rownames(datExpr)),]
# 
# datProbes = datProbes[rownames(datProbes) %in% rownames(datExpr),]
# 
# rm(to_keep, corfit, fit, lmfit, mod, ASD_pvals, ASD_lfc, ASD_pvals_lfc, top_genes)
# 
# save(file='./../working_data/RNAseq_ASD_4region_DEgenes_vst_adj_pval_lfc.Rdata', datExpr, datMeta, datProbes, ordered_top_genes)

load('./../working_data/RNAseq_ASD_4region_DEgenes_vst_adj_pval_lfc.Rdata')
DE_info = ordered_top_genes

rm(ordered_top_genes)
glue('Number of genes: ', nrow(datExpr), '\n',
     'Number of samples: ', ncol(datExpr), ' (', sum(datMeta$Diagnosis_=='ASD'), ' ASD, ',
     sum(datMeta$Diagnosis_!='ASD'), ' controls)')
## Number of genes: 3071
## Number of samples: 86 (51 ASD, 35 controls)

Dimensionality reduction using PCA

First principal component explains 93% of the total variance

reduce_dim_datExpr = function(datExpr, datMeta, var_explained=0.98){
  
  datExpr = data.frame(datExpr)
  
  datExpr_pca = prcomp(datExpr, scale.=TRUE)
  last_pc = data.frame(summary(datExpr_pca)$importance[3,]) %>% rownames_to_column(var='ID') %>% 
    filter(.[[2]] >= var_explained) %>% top_n(-1) %>% dplyr::select(ID)
  
  par(mfrow=c(1,2))
  plot(summary(datExpr_pca)$importance[2,], type='b')
  abline(v=substr(last_pc$ID, 3, nchar(last_pc$ID)), col='blue')
  plot(summary(datExpr_pca)$importance[3,], type='b')
  abline(h=var_explained, col='blue')
  
  
  print(glue('Keeping top ', substr(last_pc$ID, 3, nchar(last_pc$ID)), ' components that explain ',
             var_explained*100, '% of the variance'))
  
  datExpr_top_pc = datExpr_pca$x %>% data.frame %>% dplyr::select(PC1:last_pc$ID)
  
  return(list('datExpr'=datExpr_top_pc, 'pca_output'=datExpr_pca))
}

reduce_dim_output = reduce_dim_datExpr(datExpr, datMeta)

## Keeping top 6 components that explain 98% of the variance
datExpr_redDim = reduce_dim_output$datExpr
pca_output = reduce_dim_output$pca_output

rm(datSeq, datProbes, reduce_dim_datExpr, reduce_dim_output, datExpr)

Genes are still separated into two clouds of points:

datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2)) + geom_point() + theme_minimal()

Projecting all the original points into the space created by the two principal components and colouring by the differential expression p-value we can see that the points in the middle of the two clouds were filtered out because their DE wasn’t statistically significant. Colouring by their log2 fold change we can see that the genes from the cloud on the top are overexpressed and the genes in the bottom one underexpressed.

datMeta_backup = datMeta # So datMeta doesn't get replaced with unfiltered version
load('./../working_data/RNAseq_ASD_4region_normalized_vst.Rdata')
datMeta = datMeta_backup

ASD_pvals = DE_info$adj.P.Val
log_fold_change = DE_info$logFC

pca_data_projection = scale(datExpr) %*% pca_output$rotation %>% data.frame
p1 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=ASD_pvals)) + geom_point(alpha=0.5) + 
     theme_minimal() + theme(legend.position='bottom')
p2 = pca_data_projection %>% ggplot(aes(x=PC1, y=PC2, color=log_fold_change)) + geom_point() + 
     theme_minimal() + theme(legend.position='bottom') + scale_colour_gradient2()
grid.arrange(p1, p2, ncol=2)

rm(ASD_pvals, log_fold_change, top_genes, datExpr, datProbes, datMeta_backup, p1, p2)

SFARI genes dataset

Only 159 of our 3071 genes appear on the SFARI list, losing all 23/25 genes with a score of 1

SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv', 
    col_types = cols(`number-of-reports` = col_integer(), syndromic = col_logical()))

Gene Score count considering all genes:

SFARI_genes$`gene-score` %>% table
## .
##   1   2   3   4   5   6 
##  25  62 195 454 175  25
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19

gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'), 
                   values=SFARI_genes$`gene-symbol`, mart=mart) %>% mutate('gene-symbol'=hgnc_symbol) %>% 
                   dplyr::select('ensembl_gene_id', 'gene-symbol')

SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')

gene_scores = SFARI_genes %>% dplyr::select(ensembl_gene_id, `gene-score`, syndromic) %>% 
              right_join(gene_names, by='ensembl_gene_id') %>% dplyr::select(-'gene-symbol')

# Full (ordered) list of datExpr genes with their scores
gene_scores = data.frame('ensembl_gene_id'=rownames(datExpr_redDim)) %>% 
              left_join(gene_scores, by = 'ensembl_gene_id') %>%
              mutate('syndromic' = ifelse(syndromic==FALSE | is.na(syndromic), FALSE, TRUE),
                     'gene-bool' = ifelse(is.na(`gene-score`), FALSE, TRUE))

rm(SFARI_genes, mart, gene_names)

Gene score count for genes in datExpr

gene_scores$`gene-score` %>% table
## .
##  1  2  3  4  5  6 
##  2  5 29 74 44  5

Clustering

clusterings = list()

clusterings[['SFARI_score']] = gene_scores$`gene-score`
names(clusterings[['SFARI_score']]) = gene_scores$ensembl_gene_id

clusterings[['SFARI_bool']] = gene_scores$`gene-bool`
names(clusterings[['SFARI_bool']]) = gene_scores$ensembl_gene_id

clusterings[['syndromic']] = gene_scores$syndromic
names(clusterings[['syndromic']]) = gene_scores$ensembl_gene_id

K-means Clustering

set.seed(123)
wss = sapply(1:10, function(k) kmeans(datExpr_redDim, k, iter.max=100, nstart=25,
                                      algorithm='MacQueen')$tot.withinss)
plot(wss, type='b', main='K-Means Clustering')
best_k = 3
abline(v = best_k, col='blue')

datExpr_k_means = kmeans(datExpr_redDim, best_k, iter.max=100, nstart=25)
clusterings[['km']] = datExpr_k_means$cluster

Hierarchical Clustering

Chose k=7 as best number of clusters. SFARI genes seem to group in the last two clusters

h_clusts = datExpr_redDim %>% dist %>% hclust
# plot(h_clusts, hang = -1, cex = 0.6, labels=FALSE)
best_k = 7
clusterings[['hc']] = cutree(h_clusts, best_k)

create_viridis_dict = function(){
  min_score = clusterings[['SFARI_score']] %>% min(na.rm=TRUE)
  max_score = clusterings[['SFARI_score']] %>% max(na.rm=TRUE)
  viridis_score_cols = viridis(max_score - min_score + 1)
  names(viridis_score_cols) = seq(min_score, max_score)
  
  return(viridis_score_cols)
}

viridis_score_cols = create_viridis_dict()

dend_meta = gene_scores[match(labels(h_clusts), gene_scores$ensembl_gene_id),] %>% 
            mutate('SFARI_score' = viridis_score_cols[`gene-score`],                   # Purple: 2, Yellow: 6
                   'SFARI_bool' = ifelse(`gene-bool` == T, '#21908CFF', 'white'), # Acqua
                   'Syndromic' = ifelse(syndromic == T, 'orange', 'white')) %>% 
            dplyr::select(SFARI_score, SFARI_bool, Syndromic)

h_clusts %>% as.dendrogram %>% set('labels', rep('', nrow(datMeta))) %>% 
             set('branches_k_color', k=best_k) %>% plot
colored_bars(colors=dend_meta)

Consensus Clustering

Samples are grouped into two big clusters, two small clusters and two outliers, the first big cluster has one main subcluster, two small subclusters and three outliers, and the second one has one main subcluster, one small one and three groups of outliers.

*Output plots in clustering_genes_04_04 folder



Independent Component Analysis

Following this paper’s guidelines:

  1. Run PCA and keep enough components to explain 60% of the variance

  2. Run ICA with that same number of nbComp as principal components kept to then filter them

  3. Select components with kurtosis > 3

  4. Assign genes to clusters with FDR<0.01 using the fdrtool package

ICA_output = datExpr_redDim %>% runICA(nbComp=ncol(datExpr_redDim), method='JADE')
signals_w_kurtosis = ICA_output$S %>% data.frame %>% dplyr::select(names(apply(ICA_output$S, 2, kurtosis)>3))
ICA_clusters = apply(signals_w_kurtosis, 2, function(x) fdrtool(x, plot=F)$qval<0.01) %>% data.frame
ICA_clusters = ICA_clusters[colSums(ICA_clusters)>1]

clusterings[['ICA_min']] = rep(NA, nrow(ICA_clusters))
ICA_clusters_names = colnames(ICA_clusters)
for(c in ICA_clusters_names) clusterings[['ICA_min']][ICA_clusters[,c]] = c

clusterings[['ICA_NA']] = is.na(clusterings[['ICA_min']])

Leaves most of the observations (~92%) without a cluster (12% more than cqn normalisation):

ICA_clusters %>% rowSums %>% table
## .
##    0    1    2 
## 2830  232    9
ICA_clusters %>% mutate(cl_sum=rowSums(.)) %>% as.matrix %>% melt %>% ggplot(aes(Var2, Var1)) + 
  geom_tile(aes(fill=value)) + xlab('Clusters') + ylab('Samples') + 
  theme_minimal() + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) + coord_flip()

WGCNA

The soft R-squared value starts in 0.7 but then becomes tiny and needs a power of over 150 to recover. Taking power=1

best_power = datExpr_redDim %>% t %>% pickSoftThreshold(powerVector = seq(1, 10, by=1))
##    Power SFT.R.sq slope truncated.R.sq mean.k. median.k. max.k.
## 1      1   0.7000 2.250          0.696    2360      2590   2670
## 2      2   0.4420 0.983          0.729    2040      2330   2460
## 3      3   0.2710 0.616          0.805    1830      2140   2320
## 4      4   0.1870 0.446          0.826    1670      2000   2220
## 5      5   0.1190 0.335          0.842    1550      1870   2130
## 6      6   0.0929 0.276          0.860    1460      1770   2050
## 7      7   0.0635 0.218          0.880    1370      1670   1990
## 8      8   0.0425 0.172          0.885    1300      1590   1930
## 9      9   0.0300 0.136          0.886    1240      1510   1880
## 10    10   0.0187 0.103          0.893    1190      1450   1830
network = datExpr_redDim %>% t %>% blockwiseModules(power=1, numericLabels=TRUE)
clusterings[['WGCNA']] = network$colors
names(clusterings[['WGCNA']]) = rownames(datExpr_redDim)

It only leaves 68 genes without a cluster but classifies almost all points into the same class:

clusterings[['WGCNA']] %>% table
## .
##    0    1    2 
##   68 2921   82


Gaussian Mixture Models with hard thresholding

Number of clusters that resemble more Gaussian mixtures = 39

n_clust = datExpr_redDim %>% Optimal_Clusters_GMM(max_clusters=50, criterion='BIC', plot_data=FALSE)
plot(n_clust, type='l', main='Bayesian Information Criterion to choose number of clusters')

best_k = 27
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM']] = gmm$Log_likelihood %>% apply(1, which.max)

Plot of clusters with their centroids in gray

gmm_points = rbind(datExpr_redDim, setNames(data.frame(gmm$centroids), names(datExpr_redDim)))
gmm_labels = c(clusterings[['GMM']], rep(NA, best_k)) %>% as.factor
ggplotly(gmm_points %>% ggplot(aes(x=PC1, y=PC2, color=gmm_labels)) + geom_point() + theme_minimal())

Trying with 8 clusters

best_k = 8
gmm = datExpr_redDim %>% GMM(best_k)
clusterings[['GMM_2']] = gmm$Log_likelihood %>% apply(1, which.max)
clusterings[['GMM_2']] %>% table
## .
##   1   2   3   4   5   6   7   8 
## 292 520 455 581 265 286 330 342


Manual clustering

Separate the two clouds of points by a straight line. There seems to be a difference in the mean expression of the genes between clusters but not in their standard deviation.

# Modify equation sign if needed so blue cluter represents overexpressed genes
manual_clusters = as.factor(as.numeric(-0.1*datExpr_redDim$PC1 + 0.05 > datExpr_redDim$PC2))
datExpr_redDim %>% ggplot(aes(x=PC1, y=PC2, color=manual_clusters)) + geom_point() + 
  geom_abline(slope=-0.1, intercept=0.05, color='gray') + theme_minimal()

names(manual_clusters) = rownames(datExpr_redDim)

clusterings[['Manual']] = manual_clusters

clusterings[['Manual']] %>% table
## .
##    0    1 
## 1468 1603

Both clusters could be GM with 2 gaussians each in the means and 3 in the sd.

manual_clusters_data = cbind(apply(datExpr_redDim, 1, mean), apply(datExpr_redDim, 1, sd), 
                             manual_clusters) %>% data.frame
colnames(manual_clusters_data) = c('mean','sd','cluster')
manual_clusters_data = manual_clusters_data %>% mutate('cluster'=as.factor(cluster))
p1 = manual_clusters_data %>% ggplot(aes(x=mean, color=cluster, fill=cluster)) + 
  geom_density(alpha=0.4) + theme_minimal()
p2 = manual_clusters_data %>% ggplot(aes(x=sd, color=cluster, fill=cluster)) + 
  geom_density(alpha=0.4) + theme_minimal()
grid.arrange(p1, p2, ncol=2)

Separate clusters into two Gaussians per diagnosis by their mean:

gg_colour_hue = function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

n_clusters = 2

c1_mean = manual_clusters_data %>% filter(cluster==1) %>% dplyr::select(mean)
rownames(c1_mean) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='1']
gmm_c1_mean = c1_mean %>% GMM(n_clusters)

c2_mean = manual_clusters_data %>% filter(cluster==2) %>% dplyr::select(mean)
rownames(c2_mean) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='2']
gmm_c2_mean = c2_mean %>% GMM(n_clusters)

clusterings[['Manual_mean']] = as.character(clusterings[['Manual']])
clusterings[['Manual_mean']][clusterings[['Manual']]==0] = gmm_c1_mean$Log_likelihood %>% 
  apply(1, function(x) glue('1_',which.max(x)))
clusterings[['Manual_mean']][clusterings[['Manual']]==1] = gmm_c2_mean$Log_likelihood %>% 
  apply(1, function(x) glue('2_',which.max(x)))


plot_gaussians = manual_clusters_data %>% ggplot(aes(x=mean)) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[1], # red
                args=list(mean=gmm_c1_mean$centroids[1], sd=gmm_c1_mean$covariance_matrices[1])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[2], # green
                args=list(mean=gmm_c1_mean$centroids[2], sd=gmm_c1_mean$covariance_matrices[2])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[3], # blue
                args=list(mean=gmm_c2_mean$centroids[1], sd=gmm_c2_mean$covariance_matrices[1])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[4], # purple
                args=list(mean=gmm_c2_mean$centroids[2], sd=gmm_c2_mean$covariance_matrices[2])) +
  theme_minimal()

plot_points = datExpr_redDim %>% ggplot(aes_string(x='PC1', y='PC2', color=as.factor(clusterings[['Manual_mean']]))) + 
          geom_point() + theme_minimal()

grid.arrange(plot_gaussians, plot_points, ncol=2)

clusterings[['Manual_sd']] %>% table
## < table of extent 0 >

Separate clusters into three Gaussians per diagnosis by their sd:

n_clusters = 3

c1_sd = manual_clusters_data %>% filter(cluster==1) %>% dplyr::select(sd)
rownames(c1_sd) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='1']
gmm_c1_sd = c1_sd %>% GMM(n_clusters)

c2_sd = manual_clusters_data %>% filter(cluster==2) %>% dplyr::select(sd)
rownames(c2_sd) = rownames(manual_clusters_data)[manual_clusters_data$cluster=='2']
gmm_c2_sd = c2_sd %>% GMM(n_clusters)

clusterings[['Manual_sd']] = as.character(clusterings[['Manual']])
clusterings[['Manual_sd']][clusterings[['Manual']]==0] = gmm_c1_sd$Log_likelihood %>% 
  apply(1, function(x) glue('1_',which.max(x)))
clusterings[['Manual_sd']][clusterings[['Manual']]==1] = gmm_c2_sd$Log_likelihood %>% 
  apply(1, function(x) glue('2_',which.max(x)))


plot_gaussians = manual_clusters_data %>% ggplot(aes(x=sd)) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[1], # 
                args=list(mean=gmm_c1_sd$centroids[1], sd=gmm_c1_sd$covariance_matrices[1])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[2], # 
                args=list(mean=gmm_c1_sd$centroids[2], sd=gmm_c1_sd$covariance_matrices[2])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[3], # 
                args=list(mean=gmm_c1_sd$centroids[3], sd=gmm_c1_sd$covariance_matrices[3])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[4], # 
                args=list(mean=gmm_c2_sd$centroids[1], sd=gmm_c2_sd$covariance_matrices[1])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[5], # 
                args=list(mean=gmm_c2_sd$centroids[2], sd=gmm_c2_sd$covariance_matrices[2])) +
  stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[6], # 
                args=list(mean=gmm_c2_sd$centroids[3], sd=gmm_c2_sd$covariance_matrices[3])) +
  theme_minimal()


plot_points = datExpr_redDim %>% ggplot(aes_string(x='PC1', y='PC2', color=as.factor(clusterings[['Manual_sd']]))) + 
          geom_point() + theme_minimal()

grid.arrange(plot_gaussians, plot_points, ncol=2)

clusterings[['Manual_sd']] %>% table
## .
## 1_1 1_2 1_3 2_1 2_2 2_3 
## 580 363 525 535 621 447
rm(c1_sd, c2_sd, gmm_c1_sd, gmm_c2_sd)
# Clean up the environment a bit
rm(wss, datExpr_k_means, h_clusts, cc_output, cc_output_c1, cc_output_c2, best_k, ICA_output, 
   ICA_clusters_names, signals_w_kurtosis, n_clust, gmm, gmm_points, gmm_labels, network, 
   best_power, c, manual_clusters, manual_clusters_data, c1_sd, c2_sd, c1_mean, c2_mean, 
   gmm_c1_sd, gmm_c2_sd,gmm_c1_sd, gmm_c1_mean, gmm_c2_mean, p1, p2, pca_data_projection, dend_meta, 
   plot_gaussians, plot_points, n_clusters, viridis_score_cols, gg_colour_hue, create_viridis_dict)

Compare clusterings

Using Adjusted Rand Index:

cluster_sim = data.frame(matrix(nrow = length(clusterings), ncol = length(clusterings)))
for(i in 1:(length(clusterings))){
  cluster1 = as.factor(clusterings[[i]])
  for(j in (i):length(clusterings)){
    cluster2 = as.factor(clusterings[[j]])
    cluster_sim[i,j] = adj.rand.index(cluster1, cluster2)
  }
}
colnames(cluster_sim) = names(clusterings)
rownames(cluster_sim) = colnames(cluster_sim)

cluster_sim = cluster_sim %>% as.matrix %>% round(2)
heatmap.2(x = cluster_sim, Rowv = FALSE, Colv = FALSE, dendrogram = 'none', 
          cellnote = cluster_sim, notecol = 'black', trace = 'none', key = FALSE, 
          cexRow = 1, cexCol = 1, margins = c(7,7))

rm(i, j, cluster1, cluster2, cluster_sim)

Scatter plots

plot_points = datExpr_redDim %>% data.frame() %>% dplyr::select(PC1:PC3) %>%
              mutate(ID = rownames(.),                               k_means = as.factor(clusterings[['km']]),
                hc = as.factor(clusterings[['hc']]),                 cc_l1 = as.factor(clusterings[['cc_l1']]),
                cc_l2 = as.factor(clusterings[['cc_l2']]),           ica = as.factor(clusterings[['ICA_min']]),
                n_ica = as.factor(rowSums(ICA_clusters)),            gmm = as.factor(clusterings[['GMM']]),
                gmm_2 = as.factor(clusterings[['GMM_2']]),           wgcna = as.factor(clusterings[['WGCNA']]),    
                manual = as.factor(clusterings[['Manual']]),         manual_mean = as.factor(clusterings[['Manual_mean']]),
                manual_sd = as.factor(clusterings[['Manual_sd']]),   SFARI = as.factor(clusterings[['SFARI_score']]),
                SFARI_bool = as.factor(clusterings[['SFARI_bool']]), syndromic = as.factor(clusterings[['syndromic']])) %>%
              bind_cols(DE_info[rownames(DE_info) %in% rownames(datExpr_redDim),])
selectable_scatter_plot(plot_points, plot_points)